{
    "cells": [
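        {
            "cell_type": "markdown",
            "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05",
            "metadata": {},
            "source": [
                "# Simple Index Demo\n",
                "\n",
                "Build a `GPTSimpleVectorIndex` over a directory of documents, then query it with a Langchain output parser so the answer comes back as named fields instead of free text."
            ]
        },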
        {
            "cell_type": "markdown",
            "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119",
            "metadata": {},
            "source": [
                "#### Load documents, build the GPTSimpleVectorIndex"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 17,
            "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7",
            "metadata": {},
            "outputs": [],
            "source": [
                "import logging\n",
                "import sys\n",
                "\n",
                "# Route INFO-level log records to stdout through a single root handler.\n",
                "# basicConfig already installs a stdout StreamHandler, so adding a second one\n",
                "# would print every record twice. force=True replaces any handlers left over\n",
                "# from an earlier run of this cell, keeping the cell safe to re-run.\n",
                "logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)\n",
                "\n",
                "from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader\n",
                "from IPython.display import Markdown, display"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 2,
            "id": "03d1691e-544b-454f-825b-5ee12f7faa8a",
            "metadata": {},
            "outputs": [],
            "source": [
                "# point a reader at the essay directory, then load its documents\n",
                "reader = SimpleDirectoryReader('../paul_graham_essay/data')\n",
                "documents = reader.load_data()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 3,
            "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58",
            "metadata": {
                "scrolled": true
            },
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n",
                        "> [build_index_from_documents] Total LLM token usage: 0 tokens\n",
                        "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 18579 tokens\n",
                        "> [build_index_from_documents] Total embedding token usage: 18579 tokens\n"
                    ]
                }
            ],
            "source": [
                "# build the vector index over the loaded documents (chunk_size_limit=512)\n",
                "index = GPTSimpleVectorIndex.from_documents(\n",
                "    documents,\n",
                "    chunk_size_limit=512,\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 4,
            "id": "2bbccf1d-ac39-427c-b3a3-f8e9d1d12348",
            "metadata": {},
            "outputs": [],
            "source": [
                "# persist the index to a JSON file in the working directory\n",
                "INDEX_PATH = 'index_simple.json'\n",
                "index.save_to_disk(INDEX_PATH)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 18,
            "id": "197ca78e-1310-474d-91e3-877c3636b901",
            "metadata": {},
            "outputs": [],
            "source": [
                "# read the saved index back from its JSON file\n",
                "INDEX_PATH = 'index_simple.json'\n",
                "index = GPTSimpleVectorIndex.load_from_disk(INDEX_PATH)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "8b7d7c61-b5d7-4b8f-b90b-3ebee1103f27",
            "metadata": {},
            "source": [
                "#### Define Query + Langchain Output Parser"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 2,
            "id": "6fb88295-0840-4e2d-b79b-def0b0a63a7f",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "from gpt_index.output_parsers import LangchainOutputParser\n",
                "from gpt_index.llm_predictor import StructuredLLMPredictor\n",
                "from langchain.output_parsers import StructuredOutputParser, ResponseSchema"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 3,
            "id": "057139d2-09e8-4b8d-83a1-a2356a1475a8",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "# predictor handed to index.query(...) further down\n",
                "# NOTE(review): presumably it runs each prompt's output_parser on the raw LLM text - confirm in the gpt_index docs\n",
                "llm_predictor = StructuredLLMPredictor()"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "bc25edf7-9343-4e82-a3f1-eec4281a9371",
            "metadata": {},
            "source": [
                "**Define custom QA and Refine Prompts**"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 4,
            "id": "2833d086-d240-4798-b3c5-a83ac4593b0e",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "from gpt_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt\n",
                "from gpt_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT_TMPL, DEFAULT_REFINE_PROMPT_TMPL"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 5,
            "id": "a4b9201d-fe16-4cc0-8135-a08d9928625d",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "# (name, description) pairs for the fields the LLM should return\n",
                "schema_fields = [\n",
                "    (\"Education\", \"Describes the author's educational experience/background.\"),\n",
                "    (\"Work\", \"Describes the author's work experience/background.\"),\n",
                "]\n",
                "response_schemas = [\n",
                "    ResponseSchema(name=field_name, description=field_desc)\n",
                "    for field_name, field_desc in schema_fields\n",
                "]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 6,
            "id": "e73b87b8-90da-4ab8-9ff7-e40880277d9b",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "# Langchain parser derived from the response schemas\n",
                "lc_output_parser = StructuredOutputParser.from_response_schemas(\n",
                "    response_schemas\n",
                ")\n",
                "\n",
                "# wrap the Langchain parser in the gpt_index adapter used by the prompts\n",
                "output_parser = LangchainOutputParser(lc_output_parser)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 15,
            "id": "a9b440d4-6fb4-46e6-973f-44207b432d3f",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "# NOTE: one output parser is shared by both prompts here; each prompt could use its own parser instead.\n",
                "# NOTE: output_parser.format(...) adds the formatting instructions to a prompt template.\n",
                "\n",
                "# question-answer prompt\n",
                "fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)\n",
                "qa_prompt = QuestionAnswerPrompt(fmt_qa_tmpl, output_parser=output_parser)\n",
                "\n",
                "# refine prompt\n",
                "fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)\n",
                "refine_prompt = RefinePrompt(fmt_refine_tmpl, output_parser=output_parser)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 10,
            "id": "1ba18a80-35f4-4fd4-9b13-9f13f84db4fe",
            "metadata": {
                "tags": []
            },
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Context information is below. \n",
                        "---------------------\n",
                        "{context_str}\n",
                        "---------------------\n",
                        "Given the context information and not prior knowledge, answer the question: {query_str}\n",
                        "\n",
                        "\n",
                        "The output should be a markdown code snippet formatted in the following schema:\n",
                        "\n",
                        "```json\n",
                        "{{\n",
                        "\t\"Education\": string  // Describes the author's educational experience/background.\n",
                        "\t\"Work\": string  // Describes the author's work experience/background.\n",
                        "}}\n",
                        "```\n"
                    ]
                }
            ],
            "source": [
                "# inspect the QA template now that the format instructions are attached\n",
                "print(fmt_qa_tmpl)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4",
            "metadata": {},
            "source": [
                "#### Query Index"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 19,
            "id": "fb9cdf43-0f31-4c36-869b-df9fa50aebdb",
            "metadata": {
                "tags": []
            },
            "outputs": [
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 609 tokens\n"
                    ]
                },
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "> [query] Total LLM token usage: 609 tokens\n"
                    ]
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 11 tokens\n"
                    ]
                },
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "> [query] Total embedding token usage: 11 tokens\n"
                    ]
                }
            ],
            "source": [
                "# query the index with the parser-aware prompts and the structured predictor\n",
                "question = \"What are a few things the author did growing up?\"\n",
                "response = index.query(\n",
                "    question,\n",
                "    text_qa_template=qa_prompt,\n",
                "    refine_template=refine_prompt,\n",
                "    llm_predictor=llm_predictor,\n",
                ")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 20,
            "id": "bc7760b6-5be3-4303-b97e-3f5edacf674b",
            "metadata": {
                "tags": []
            },
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "{'Education': 'Before college, the author wrote short stories and experimented with programming on an IBM 1401.', 'Work': 'The author worked on writing and programming outside of school.'}\n"
                    ]
                }
            ],
            "source": [
                "# show the answer; in the recorded run it prints a dict keyed by 'Education' and 'Work'\n",
                "print(response)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "002a4b5f-51ac-437a-afe7-94e2687737a9",
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "llama_index",
            "language": "python",
            "name": "llama_index"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.10.10"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 5
}